Practical 8: Sentiment Analysis¶


Text Mining, Transforming Text into Knowledge (202400006)¶

In this practical, we will apply both dictionary- and deep learning-based sentiment analysis approaches on the IMDB sentiment classification task.

We are going to use the following libraries. Take care to have them installed!

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tf_keras

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn import metrics

Let's get started!¶

Here we are going to classify movie reviews as positive or negative using the text of the review. We will use the IMDB dataset that contains the text of 50,000 movie reviews from the Internet Movie Database (IMDb). These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and test sets are balanced, meaning they contain an equal number of positive and negative reviews.

1. The IMDB dataset is available on TensorFlow datasets. Use the following code to download the IMDB dataset.

In [2]:
# Split the training set into 60% and 40% to end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

2. Use the following code to explore the data and print the first 4 examples.

In [3]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(4)))
train_examples_batch
Out[3]:
<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.',
       b'Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to enforce the law themselves, then gunfighters battling it out on the streets for control of the town? <br /><br />Nothing even remotely resembling that happened on the Canadian side of the border during the Klondike gold rush. Mr. Mann and company appear to have mistaken Dawson City for Deadwood, the Canadian North for the American Wild West.<br /><br />Canadian viewers be prepared for a Reefer Madness type of enjoyable howl with this ludicrous plot, or, to shake your head in disgust.',
       b'This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.'],
      dtype=object)>
In [4]:
train_labels_batch
Out[4]:
<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 0, 0, 1])>

The label is an integer value of either 0 or 1, where 0 is a negative review, and 1 is a positive review.
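
Before moving on, it can be useful to confirm that the labels in a larger slice of the training data are indeed roughly balanced. A minimal sketch (the exact counts will vary, since a single batch need not be exactly 50/50):

# Count how many 0 (negative) and 1 (positive) labels appear in a batch of 2,000 reviews.
_, labels_batch = next(iter(train_data.batch(2000)))
print(np.bincount(labels_batch.numpy()))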

Lexicon-based sentiment analysis¶

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media, but it also works well on texts from other domains.

The VADER lexicon has been empirically validated by multiple independent human judges; it is a gold-standard sentiment lexicon that is especially attuned to microblog-like contexts.

It has some advantages:

  • Unsupervised
  • Fast and deployable
  • Reasonable performance even without preprocessing

However, there are some disadvantages:

  • It is a rule-based approach: it relies on a fixed list of predefined polarity scores for each word
  • Its performance plateaus below that of state-of-the-art NLP approaches

3. Create a Vader analyzer using the SentimentIntensityAnalyzer function, and look at the polarity scores of some example sentences.

In [5]:
analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores("you cannot be negative"))
{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound': 0.4585}

The output is 50% positive and 50% neutral, with a compound score of 0.4585. VADER's negation rule recognizes that "cannot ... negative" flips the polarity of "negative", so the sentence ends up scored as mildly positive rather than negative.
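
The VADER documentation recommends thresholding the compound score at ±0.05 when mapping it to discrete classes. The helper below is a small sketch of that convention (the function name is ours, not part of the library):

def label_from_compound(compound):
    # Convention from the VADER documentation:
    # compound >= 0.05 -> positive, compound <= -0.05 -> negative, otherwise neutral.
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

for sentence in ["you cannot be negative",
                 "This movie was absolutely terrible.",
                 "The film was okay, nothing special."]:
    compound = analyzer.polarity_scores(sentence)["compound"]
    print(f"{compound:+.4f}  {label_from_compound(compound)}  {sentence}")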

4. Calculate the compound sentiment scores of the first 1,000 training examples. Convert the final scores to 0 (negative) and 1 (positive).

In [6]:
train_examples_batch, train_labels_batch = next(iter(train_data.batch(1000)))
In [7]:
# Predict 0 (negative) by default and switch to 1 (positive) whenever
# VADER's compound score for a review is greater than zero.
score = [0 for x in range(1000)]
for i in range(1000):
    text = train_examples_batch.numpy()[i].decode("utf-8")
    sent = analyzer.polarity_scores(text)['compound']
    if sent > 0:
        score[i] = 1

5. Evaluate the performance of the predicted sentiment scores using the classification_report function. How do you interpret your results?

In [8]:
print(metrics.classification_report(train_labels_batch, score, target_names=['negative', 'positive']))
              precision    recall  f1-score   support

    negative       0.78      0.53      0.63       490
    positive       0.66      0.85      0.74       510

    accuracy                           0.70      1000
   macro avg       0.72      0.69      0.69      1000
weighted avg       0.71      0.70      0.69      1000
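
To see where the lexicon-based predictions go wrong, a confusion matrix over the same 1,000 reviews is a useful complement to the report. A minimal sketch using the already-imported sklearn and pandas:

# Rows: actual labels, columns: VADER predictions (0 = negative, 1 = positive).
cm = metrics.confusion_matrix(train_labels_batch, score)
print(pd.DataFrame(cm,
                   index=["actual negative", "actual positive"],
                   columns=["predicted negative", "predicted positive"]))

Consistent with the recall values above, most errors should be negative reviews that VADER scores as positive.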

Deep learning-based sentiment analysis¶

In this part of the practical, we are going to use pre-trained word embedding models from TensorFlow Hub (https://tfhub.dev/) to do sentiment classification on movie reviews. TensorFlow Hub is a repository of trained machine learning models.

6. Use a pre-trained model from TensorFlow Hub called "google/nnlm-en-dim50/2", and create a Keras embedding layer that uses this model to embed the sentences, and try it out on a couple of input examples.

In [9]:
# Token based text embedding trained on English Google News 7B corpus.
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"

hub_layer = hub.KerasLayer(embedding, input_shape=[],
                           dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])
Out[9]:
<tf.Tensor: shape=(3, 50), dtype=float32, numpy=
array([[ 0.5423195 , -0.0119017 ,  0.06337538,  0.06862972, -0.16776837,
        -0.10581174,  0.16865303, -0.04998824, -0.31148055,  0.07910346,
         0.15442263,  0.01488662,  0.03930153,  0.19772711, -0.12215476,
        -0.04120981, -0.2704109 , -0.21922152,  0.26517662, -0.80739075,
         0.25833532, -0.3100421 ,  0.28683215,  0.1943387 , -0.29036492,
         0.03862849, -0.7844411 , -0.0479324 ,  0.4110299 , -0.36388892,
        -0.58034706,  0.30269456,  0.3630897 , -0.15227164, -0.44391504,
         0.19462997,  0.19528408,  0.05666234,  0.2890704 , -0.28468323,
        -0.00531206,  0.0571938 , -0.3201318 , -0.04418665, -0.08550783,
        -0.55847436, -0.23336391, -0.20782952, -0.03543064, -0.17533456],
       [ 0.56338924, -0.12339553, -0.10862679,  0.7753425 , -0.07667089,
        -0.15752277,  0.01872335, -0.08169781, -0.3521876 ,  0.4637341 ,
        -0.08492756,  0.07166859, -0.00670817,  0.12686075, -0.19326553,
        -0.52626437, -0.3295823 ,  0.14394785,  0.09043556, -0.5417555 ,
         0.02468163, -0.15456742,  0.68333143,  0.09068331, -0.45327246,
         0.23180096, -0.8615696 ,  0.34480393,  0.12838456, -0.58759046,
        -0.4071231 ,  0.23061076,  0.48426893, -0.27128142, -0.5380916 ,
         0.47016326,  0.22572741, -0.00830663,  0.2846242 , -0.304985  ,
         0.04400365,  0.25025874,  0.14867121,  0.40717036, -0.15422426,
        -0.06878027, -0.40825695, -0.3149215 ,  0.09283665, -0.20183425],
       [ 0.7456154 ,  0.21256861,  0.14400336,  0.5233862 ,  0.11032254,
         0.00902788, -0.3667802 , -0.08938274, -0.24165542,  0.33384594,
        -0.11194605, -0.01460047, -0.0071645 ,  0.19562712,  0.00685216,
        -0.24886718, -0.42796347,  0.18620004, -0.05241098, -0.66462487,
         0.13449019, -0.22205497,  0.08633006,  0.43685386,  0.2972681 ,
         0.36140734, -0.7196889 ,  0.05291241, -0.14316116, -0.1573394 ,
        -0.15056328, -0.05988009, -0.08178931, -0.15569411, -0.09303783,
        -0.18971172,  0.07620788, -0.02541647, -0.27134508, -0.3392682 ,
        -0.10296468, -0.27275252, -0.34078008,  0.20083304, -0.26644835,
         0.00655449, -0.05141488, -0.04261917, -0.45413622,  0.20023568]],
      dtype=float32)>

Here you see that no matter the length of the input text, the output shape of the embeddings is: (num_examples, embedding_dimension).
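
A quick way to confirm the fixed output size is to embed two strings of very different lengths; both come back as 50-dimensional vectors (a minimal sketch using the hub_layer defined above):

# Inputs of different lengths are each mapped to a single 50-dimensional vector.
sample = tf.constant(["Great film!",
                      "A long, meandering review that keeps going on and on about the plot."])
print(hub_layer(sample).shape)  # (2, 50)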

7. Build a deep learning model using the embedding layer and one hidden layer.

In [10]:
model = tf_keras.Sequential()
model.add(hub_layer)
model.add(tf_keras.layers.Dense(16, activation='relu'))
model.add(tf_keras.layers.Dense(1))

model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 keras_layer (KerasLayer)    (None, 50)                48190600  
                                                                 
 dense (Dense)               (None, 16)                816       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 48191433 (183.84 MB)
Trainable params: 48191433 (183.84 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

8. Compile and train the model for 10 epochs in batches of 512 samples.

In [11]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
In [12]:
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=10,
                    validation_data=validation_data.batch(512),
                    verbose=1)
Epoch 1/10
30/30 [==============================] - 5s 167ms/step - loss: 1.2991 - accuracy: 0.5319 - val_loss: 0.7762 - val_accuracy: 0.5618
Epoch 2/10
30/30 [==============================] - 5s 177ms/step - loss: 0.6703 - accuracy: 0.6216 - val_loss: 0.6786 - val_accuracy: 0.6141
Epoch 3/10
30/30 [==============================] - 5s 174ms/step - loss: 0.5646 - accuracy: 0.7145 - val_loss: 0.6355 - val_accuracy: 0.6803
Epoch 4/10
30/30 [==============================] - 5s 179ms/step - loss: 0.4778 - accuracy: 0.7869 - val_loss: 0.5884 - val_accuracy: 0.7247
Epoch 5/10
30/30 [==============================] - 5s 170ms/step - loss: 0.3951 - accuracy: 0.8426 - val_loss: 0.5424 - val_accuracy: 0.7603
Epoch 6/10
30/30 [==============================] - 5s 174ms/step - loss: 0.3218 - accuracy: 0.8828 - val_loss: 0.5176 - val_accuracy: 0.7901
Epoch 7/10
30/30 [==============================] - 5s 179ms/step - loss: 0.2570 - accuracy: 0.9118 - val_loss: 0.5084 - val_accuracy: 0.8135
Epoch 8/10
30/30 [==============================] - 5s 170ms/step - loss: 0.2030 - accuracy: 0.9311 - val_loss: 0.5003 - val_accuracy: 0.8260
Epoch 9/10
30/30 [==============================] - 5s 170ms/step - loss: 0.1861 - accuracy: 0.9359 - val_loss: 0.5321 - val_accuracy: 0.8302
Epoch 10/10
30/30 [==============================] - 5s 168ms/step - loss: 0.1424 - accuracy: 0.9575 - val_loss: 0.5164 - val_accuracy: 0.8348

9. Evaluate the model on the test set.

In [13]:
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))
49/49 - 1s - loss: 0.5679 - accuracy: 0.8169 - 1s/epoch - 24ms/step
loss: 0.568
accuracy: 0.817

This fairly simple approach achieves a test accuracy of about 82%.
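
To see how training progressed over the 10 epochs, the history object returned by model.fit can be plotted. A minimal sketch, assuming matplotlib is installed (it is not among the imports above):

import matplotlib.pyplot as plt

# history.history stores one value per epoch for each tracked metric.
plt.plot(history.history["accuracy"], label="training accuracy")
plt.plot(history.history["val_accuracy"], label="validation accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()

The widening gap between training and validation accuracy in the log above suggests the model starts to overfit after a few epochs.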

10. For your next experiment load a more complex pretrained word embedding for the embedding layer. Train and evaluate your model.

In [14]:
embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                           dtype=tf.string, trainable=True)
# hub_layer(train_examples_batch[:3])

Here we tried google/nnlm-en-dim128-with-normalization/2: a model trained with the same NNLM (Neural Network Language Model) architecture and on the same data as google/nnlm-en-dim50/2, but with a larger embedding dimension. Larger-dimensional embeddings can improve performance on your task, but they may take longer to train. This model also applies additional text normalization, such as removing punctuation, which can help if the text in your task contains extra characters or punctuation. You can try other pretrained embeddings from TensorFlow Hub, for example BERT, but remember that these are huge models and need a lot of training time.
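
Because the larger embedding makes training slower, a cheaper variant of this experiment is to keep the pre-trained embedding frozen and only train the dense layers on top. A minimal sketch (not run here); the variable names are ours:

# trainable=False freezes the ~125M pre-trained embedding weights,
# so only the two small dense layers are updated during training.
frozen_hub_layer = hub.KerasLayer(embedding, input_shape=[],
                                  dtype=tf.string, trainable=False)

frozen_model = tf_keras.Sequential([
    frozen_hub_layer,
    tf_keras.layers.Dense(16, activation='relu'),
    tf_keras.layers.Dense(1)
])

This typically trains much faster, usually at the cost of some accuracy, since the embedding cannot adapt to the movie-review domain.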

In Practical 10, you will fine-tune and fit BERT!

In [15]:
model = tf_keras.Sequential()
model.add(hub_layer)
model.add(tf_keras.layers.Dense(16, activation='relu'))
model.add(tf_keras.layers.Dense(1))

model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 keras_layer_1 (KerasLayer)  (None, 128)               124642688 
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
=================================================================
Total params: 124644769 (475.48 MB)
Trainable params: 124644769 (475.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
In [16]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
In [17]:
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=10,
                    validation_data=validation_data.batch(512),
                    verbose=1)
Epoch 1/10
30/30 [==============================] - 15s 463ms/step - loss: 1.0195 - accuracy: 0.5875 - val_loss: 0.5992 - val_accuracy: 0.7271
Epoch 2/10
30/30 [==============================] - 14s 470ms/step - loss: 0.4142 - accuracy: 0.8361 - val_loss: 0.4744 - val_accuracy: 0.8210
Epoch 3/10
30/30 [==============================] - 13s 434ms/step - loss: 0.2581 - accuracy: 0.9127 - val_loss: 0.4361 - val_accuracy: 0.8540
Epoch 4/10
30/30 [==============================] - 13s 424ms/step - loss: 0.1640 - accuracy: 0.9509 - val_loss: 0.4463 - val_accuracy: 0.8732
Epoch 5/10
30/30 [==============================] - 13s 425ms/step - loss: 0.1111 - accuracy: 0.9696 - val_loss: 0.6297 - val_accuracy: 0.8448
Epoch 6/10
30/30 [==============================] - 12s 416ms/step - loss: 0.0970 - accuracy: 0.9758 - val_loss: 0.4966 - val_accuracy: 0.8812
Epoch 7/10
30/30 [==============================] - 13s 425ms/step - loss: 0.0613 - accuracy: 0.9902 - val_loss: 0.5020 - val_accuracy: 0.8819
Epoch 8/10
30/30 [==============================] - 13s 427ms/step - loss: 0.0460 - accuracy: 0.9945 - val_loss: 0.5143 - val_accuracy: 0.8807
Epoch 9/10
30/30 [==============================] - 13s 422ms/step - loss: 0.0366 - accuracy: 0.9964 - val_loss: 0.5305 - val_accuracy: 0.8798
Epoch 10/10
30/30 [==============================] - 13s 418ms/step - loss: 0.0306 - accuracy: 0.9977 - val_loss: 0.5475 - val_accuracy: 0.8795
In [18]:
results = model.evaluate(test_data.batch(512), verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))
49/49 - 3s - loss: 0.6082 - accuracy: 0.8548 - 3s/epoch - 52ms/step
loss: 0.608
accuracy: 0.855
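
As a final sanity check, the trained model can score raw review strings directly, since the hub layer takes care of tokenization and embedding. A minimal sketch; the example sentences are made up, and because the last Dense(1) layer has no activation, tf.sigmoid is applied to map the raw scores to the 0-1 range:

sample_reviews = tf.constant([
    "An absolute masterpiece, I loved every minute of it.",
    "Dull, predictable, and far too long."
])

# model.predict returns one raw score per review; higher means more positive.
raw_scores = model.predict(sample_reviews)
probabilities = tf.sigmoid(raw_scores).numpy().flatten()

for review, p in zip(sample_reviews.numpy(), probabilities):
    print(f"{p:.2f}  {review.decode('utf-8')}")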